package com.lyf.zone;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that extracts administrative-area records
 * (code, name, parent code, level) from hierarchical listing pages and
 * optionally writes them out as CSV and/or SQL lines.
 *
 * <p>The page type is recognised by the CSS class carried by its table rows:
 * {@code provincetr} (level 1), {@code citytr} (2), {@code countytr} (3),
 * {@code towntr} (4) and {@code villagetr} (5). Links found on a page are
 * queued for crawling as long as the next level does not exceed
 * {@link #getMaxLevel()}.
 */
public class AreaPage implements PageProcessor {

    /** Row CSS classes in level order; the level of a class is its index + 1. */
    private static final String[] ROW_CLASSES = {
        "provincetr", "citytr", "countytr", "towntr", "villagetr"
    };

    /** Level of the top-most (province) page, which has a different row layout. */
    private static final int PROVINCE_LEVEL = 1;

    /** Level of the deepest (village) page, whose rows carry no links. */
    private static final int VILLAGE_LEVEL = 5;

    /** Codes are right-padded with '0' to this length. */
    private static final int CODE_LENGTH = 12;

    /** Priority passed to {@link Page#addTargetRequests(List, long)} for child pages. */
    private static final int LINK_PRIORITY = 500;

    /** Matches cell values consisting solely of digits (i.e. codes). */
    private static final Pattern DIGITS = Pattern.compile("\\d+");

    private final Site site =
            Site.me().setCharset("UTF-8").setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * CSV output path; leave unset (blank) when no CSV output is wanted.
     */
    private String csvPath;

    /**
     * SQL output path; leave unset (blank) when no SQL output is wanted.
     */
    private String sqlPath;

    /**
     * Deepest level to crawl; defaults to 4.
     */
    private int maxLevel = 4;

    /**
     * Whether all levels are written to the single configured file. When
     * {@code false}, a {@code _<level>} suffix is inserted before the extension
     * so each level gets its own file.
     */
    private boolean outFull = true;

    public String getCsvPath() {
        return csvPath;
    }

    public void setCsvPath(String csvPath) {
        this.csvPath = csvPath;
    }

    public String getSqlPath() {
        return sqlPath;
    }

    public void setSqlPath(String sqlPath) {
        this.sqlPath = sqlPath;
    }

    public int getMaxLevel() {
        return maxLevel;
    }

    public void setMaxLevel(int maxLevel) {
        this.maxLevel = maxLevel;
    }

    public boolean isOutFull() {
        return outFull;
    }

    public void setOutFull(boolean outFull) {
        this.outFull = outFull;
    }

    /**
     * Extracts the area rows of one page, writes them to the configured
     * outputs and queues the child links for the next level.
     *
     * <p>Pages that contain none of the known row classes, or whose level is
     * deeper than {@link #getMaxLevel()}, are ignored.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        int level = detectLevel(page.getHtml().get());
        // level == 0: unrecognised page. Previously this fell through with an
        // empty class name, producing the invalid CSS selector ".".
        if (level == 0 || level > maxLevel) {
            return;
        }

        String rowClass = ROW_CLASSES[level - 1];
        String parentCode =
                level > PROVINCE_LEVEL ? parentCodeOf(page.getRequest().getUrl()) : null;

        List<AreaVo> areas = new ArrayList<>();
        List<String> links = new ArrayList<>();

        for (Selectable row : page.getHtml().$("." + rowClass).nodes()) {
            List<Selectable> cells = row.$("td").nodes();
            if (level == PROVINCE_LEVEL) {
                collectProvinces(cells, areas, links);
            } else {
                AreaVo area = parseRow(cells, level, parentCode, links);
                if (area != null) {
                    areas.add(area);
                }
            }
        }

        writeOutputs(areas, level);
        pauseBriefly();

        if (level + 1 > maxLevel) {
            return;
        }
        if (!links.isEmpty()) {
            page.addTargetRequests(links, LINK_PRIORITY);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Determines the page level from the first known row class found in the HTML.
     *
     * @return the 1-based level, or 0 when no known row class is present
     */
    private static int detectLevel(String html) {
        if (html == null) {
            return 0;
        }
        for (int i = 0; i < ROW_CLASSES.length; i++) {
            if (html.contains(ROW_CLASSES[i])) {
                return i + 1;
            }
        }
        return 0;
    }

    /**
     * Derives the parent code from the last path segment of the page URL:
     * the segment without its extension, right-padded with '0' to 12 characters.
     */
    private static String parentCodeOf(String url) {
        String fileName = url.substring(url.lastIndexOf('/') + 1);
        int dot = fileName.lastIndexOf('.');
        // Tolerate a segment without an extension instead of failing on substring(0, -1).
        String stem = dot >= 0 ? fileName.substring(0, dot) : fileName;
        return StringUtils.rightPad(stem, CODE_LENGTH, "0");
    }

    /**
     * Handles one row of a province page, where every linked cell is its own
     * area: the code comes from the link target and the name from the link text.
     */
    private static void collectProvinces(List<Selectable> cells, List<AreaVo> areas,
            List<String> links) {
        for (Selectable cell : cells) {
            String href = cell.$("a", "href").toString();
            if (StringUtils.isBlank(href)) {
                continue;
            }
            links.add(href);
            String name = cell.xpath("//a/text()").toString();
            String code = StringUtils.rightPad(href.replace(".html", ""), CODE_LENGTH, "0");

            System.out.println(code + "," + name);
            areas.add(newArea(PROVINCE_LEVEL, null, code, name));
        }
    }

    /**
     * Handles one row of a level 2-5 page, where the cells of a row together
     * describe a single area. All-digit cell values are treated as the code,
     * anything else as the name. On village pages only digit values longer
     * than 10 characters are accepted as the code; shorter digit values are ignored.
     *
     * @return the parsed area, or {@code null} when the row is an unlinked
     *         "市辖区" placeholder that should be skipped
     */
    private static AreaVo parseRow(List<Selectable> cells, int level, String parentCode,
            List<String> links) {
        String name = "";
        String code = "";
        String href = null;

        for (Selectable cell : cells) {
            String value;
            if (level < VILLAGE_LEVEL) {
                href = cell.$("a", "href").toString();
                if (href != null) {
                    links.add(href);
                    value = cell.xpath("//a/text()").toString();
                } else {
                    value = cell.xpath("///text()").toString();
                }
            } else {
                value = cell.xpath("///text()").toString();
            }

            // A cell without text yields null; skip it rather than throwing an NPE.
            if (value == null) {
                continue;
            }
            if (DIGITS.matcher(value).matches()) {
                if (level < VILLAGE_LEVEL || value.length() > 10) {
                    code = value;
                }
            } else {
                name = value;
            }
        }

        // Skip rows named "市辖区" that have no link.
        if (StringUtils.isBlank(href) && StringUtils.equals(name, "市辖区")) {
            return null;
        }
        System.out.println(code + "," + name);
        return newArea(level, parentCode, code, name);
    }

    /** Builds an {@link AreaVo} from its four attributes. */
    private static AreaVo newArea(int level, String parentCode, String code, String name) {
        AreaVo area = new AreaVo();
        area.setLevel(level);
        area.setParentCode(parentCode);
        area.setCode(code);
        area.setName(name);
        return area;
    }

    /** Writes the areas to the CSV and/or SQL outputs that are configured. */
    private void writeOutputs(List<AreaVo> areas, int level) {
        if (StringUtils.isNotBlank(csvPath)) {
            List<String> lines =
                    areas.stream().map(AreaVo::toString).collect(Collectors.toList());
            WriteUtils.write(lines, outFull ? csvPath : perLevelPath(csvPath, level, ".csv"));
        }
        if (StringUtils.isNotBlank(sqlPath)) {
            List<String> lines =
                    areas.stream().map(AreaVo::toSql).collect(Collectors.toList());
            WriteUtils.write(lines, outFull ? sqlPath : perLevelPath(sqlPath, level, ".sql"));
        }
    }

    /**
     * Builds the per-level file name: the base path without its extension,
     * followed by {@code _<level>} and the given extension.
     */
    private static String perLevelPath(String basePath, int level, String extension) {
        int dot = basePath.lastIndexOf('.');
        int separator = Math.max(basePath.lastIndexOf('/'), basePath.lastIndexOf('\\'));
        // Only a dot inside the file name marks an extension; a path without
        // one (or with dots only in directory names) is used as-is.
        String stem = dot > separator ? basePath.substring(0, dot) : basePath;
        return stem + "_" + level + extension;
    }

    /** Sleeps for 10 ms, restoring the interrupt flag if interrupted. */
    private static void pauseBriefly() {
        try {
            Thread.sleep(10);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
